The typical goals in scientific investigations are to identify and quantify causal relationships between variables. Causal inference refers to the design and analysis of data for uncovering causal relationships between treatment/intervention variables and outcome variables.
We care about causal inference because a large proportion of real-life questions of interest are questions of causality, not correlation.
Causality has been of concern since the dawn of civilization.
From: Mesopotamian pharmacopoeias, accessed September 26, 2014.
Rigorous statistical frameworks for causality that can be applied to many types of data have been established for only a century.
Designed experiments are the gold standard for causal inference: the controls and the physical act of randomization guarantee, in expectation, no bias due to confounding, and they enable methods of statistical analysis that require no parametric assumptions (Imbens & Rubin, 2015).
Observational studies can yield valid causal inferences if they are designed prior to analysis, so as to approximate a randomized experiment.
Machine learning algorithms can enable powerful causal inferences in Big Data settings with many covariates and treatment effect heterogeneity.
The Rubin Causal Model (RCM) is a rigorous statistical framework for drawing causal inferences from many types of studies.
Applications of the RCM involve three major steps.
☝ There exist other frameworks and models for causal inference. One prominent framework has been developed by Judea Pearl, and is described in Pearl (2009). Due to the focus of the presentations and demonstrations in this boot camp, we will just consider the RCM in this presentation. To learn more about other causal inference frameworks, be sure to watch the recordings of Purdue's Department of Statistics 2021 Distinguished Theme Seminar Series.
Before designing/analyzing an experiment/observational study, it is important to clearly define the Science of the problem.
Science: Define experimental units, covariates, treatments, potential outcomes, estimands.
Experimental unit: physical object(s) at a particular point in time.
An experimental unit assigned a treatment at a particular point in time could have been exposed to an alternative treatment at that same point in time.
Each experimental unit and treatment pair corresponds to a potential outcome.
Unit | Control Potential Outcome | Treatment Potential Outcome |
---|---|---|
$i$ | $Y_i(0)$ | $Y_i(1)$ |
A unit-level causal effect is a comparison of potential outcomes for the same unit at the same point in time post-treatment.
One standard unit-level causal effect is simply $Y_i(1) - Y_i(0)$.
☝ A causal effect is not defined in terms of comparisons of outcomes at different times, as in before-after comparisons.
Unit | Control (0) or Treatment (1)? | $Y_i(0)$ | $Y_i(1)$ | $Y_i(1) - Y_i(0)$ |
---|---|---|---|---|
1 | 1 | ? | ✔ | ? |
For an experimental unit, at most one of the potential outcomes can be realized and thus observed.
To learn about causal effects, we need multiple experimental units, some exposed to treatment and others to control.
Several subtle issues must be considered in the presence of multiple units, and with an assignment mechanism.
The Stable Unit-Treatment Value Assumption (SUTVA, Imbens & Rubin, 2015: p. 10) consists of two conditions on the Science that yield unambiguous potential outcomes and causal effects.
Experiments and observational studies are typically designed to ensure that SUTVA holds.
Unit | $X_i$ | $Y_i(0)$ | $Y_i(1)$ |
---|---|---|---|
1 | $X_1$ | $Y_1(0)$ | $Y_1(1)$ |
2 | $X_2$ | $Y_2(0)$ | $Y_2(1)$ |
$\vdots$ | $\vdots$ | $\vdots$ | $\vdots$ |
$N$ | $X_N$ | $Y_N(0)$ | $Y_N(1)$ |
Finite-population causal estimand: Comparison of potential outcomes for one set of experimental units.
$$ \left \{ Y_i(1) : i = 1, \ldots, N \right \} \ \mathrm{vs.} \ \left \{ Y_i(0) : i = 1, \ldots, N \right \} $$

The choice of estimand is typically governed by the question of interest for the Science.
One common estimand is the average treatment effect
$$ \tau = \bar{Y}(1) - \bar{Y}(0) = \frac{1}{N} \sum_{i=1}^N Y_i(1) - \frac{1}{N} \sum_{i=1}^N Y_i(0), $$

but this is not the only possible causal estimand of interest.
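As a small numerical sketch (the potential outcomes below are fabricated for illustration), the finite-population average treatment effect is just the difference of the two column averages of the complete Science table:

```r
# Hypothetical complete table of potential outcomes for N = 4 units;
# in practice at most one of Y0[i], Y1[i] is ever observed.
Y0 <- c(5, 3, 8, 2)
Y1 <- c(6, 3, 7, 5)

tau <- mean(Y1) - mean(Y0)  # finite-population average treatment effect
tau                         # 0.75
```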
Unit | $X_i$ | Control (0) or Treatment (1)? | $Y_i(0)$ | $Y_i(1)$ | $Y_i(1) - Y_i(0)$ |
---|---|---|---|---|---|
1 | $X_1$ | 1 | ? | ✔ | ? |
2 | $X_2$ | 0 | ✔ | ? | ? |
$\vdots$ | $\vdots$ | $\vdots$ | $\vdots$ | $\vdots$ | $\vdots$ |
$N$ | $X_N$ | 1 | ? | ✔ | ? |
We never observe all potential outcomes or all unit-level effects.
Causal inference is a missing data problem, and a key role is played by the assignment mechanism.
Assignment mechanism: Description of how treatments are assigned to the experimental units (and the corresponding outcomes are observed).
Observed outcomes can help us infer causal effects only when we account for how units came to receive their treatments.
☝ Potential outcomes and causal effects are well-defined regardless of the assignment mechanism.
Subject | $Y_i(0)$ | $Y_i(1)$ | $Y_i(1) - Y_i(0)$ |
---|---|---|---|
1 | 1 | 7 | 6 |
2 | 6 | 5 | -1 |
3 | 1 | 5 | 4 |
4 | 8 | 7 | -1 |
5 | 2 | 3 | 1 |
6 | 6 | 3 | -3 |
Average | 4 | 5 | 1 |
On average, surgery increases survival time by 1 year.
$$ \bar{Y}(1) - \bar{Y}(0) = \frac{1}{6} \sum_{i=1}^6 Y_i(1) - \frac{1}{6} \sum_{i=1}^6 Y_i(0) = 1 $$

Suppose that the doctor running this clinical trial was perfect: she assigns to each subject the procedure that gives that subject the maximum survival time.
In this case, the observed data would be:
Subject | Drug (0) or Surgery (1)? | $Y_i(0)$ | $Y_i(1)$ | $Y_i(1) - Y_i(0)$ |
---|---|---|---|---|
1 | 1 | ? | 7 | ? |
2 | 0 | 6 | ? | ? |
3 | 1 | ? | 5 | ? |
4 | 0 | 8 | ? | ? |
5 | 1 | ? | 3 | ? |
6 | 0 | 6 | ? | ? |
At first glance, the observed outcomes appear to suggest that surgery is worse than drug on average.
$$ \bar{y}^{\mathrm{obs}}(1) - \bar{y}^{\mathrm{obs}}(0) = -5/3 $$

In general, $\bar{y}^{\mathrm{obs}}(1) - \bar{y}^{\mathrm{obs}}(0)$ would not be an appropriate estimator of $\bar{Y}(1) - \bar{Y}(0)$ if the assignment mechanism depends on the potential outcomes.
In this case, this estimator is ignoring the confounding from the Perfect Doctor's (lurking) judgments about assignment of surgery and drug to the subjects, and is accordingly biased.
An estimator in this setting should take into account the relationships between the potential outcomes that governed the Perfect Doctor's assignment mechanism.
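The Perfect Doctor's bias can be reproduced in a few lines of R, using the survival times from the complete potential-outcomes table for this example:

```r
# Perfect Doctor example: survival times in years
Y0 <- c(1, 6, 1, 8, 2, 6)  # drug potential outcomes
Y1 <- c(7, 5, 5, 7, 3, 3)  # surgery potential outcomes
W  <- as.numeric(Y1 > Y0)  # Perfect Doctor assigns whichever treatment is better

mean(Y1) - mean(Y0)                  # true average treatment effect: 1
mean(Y1[W == 1]) - mean(Y0[W == 0])  # naive observed contrast: -5/3
```

The naive contrast has the wrong sign precisely because the assignment depends on the potential outcomes.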
Subject | Drug (0) or Surgery (1)? | $Y_i(0)$ | $Y_i(1)$ | $Y_i(1) - Y_i(0)$ |
---|---|---|---|---|
1 | 1 | $\leq 7$ | 7 | $\geq 0$ |
2 | 0 | 6 | $\leq 6$ | $\leq 0$ |
3 | 1 | $\leq 5$ | 5 | $\geq 0$ |
4 | 0 | 8 | $\leq 8$ | $\leq 0$ |
5 | 1 | $\leq 3$ | 3 | $\geq 0$ |
6 | 0 | 6 | $\leq 6$ | $\leq 0$ |
Causal inference requires a consideration of why the units received one treatment rather than another.
One may not be able to obtain valid causal inferences from observed outcomes if one ignores the assignment mechanism.
Knowledge of the assignment mechanism can also be sufficient for certain causal inferences (Imbens & Rubin, 2015).
We proceed to introduce notation and assumptions for assignment mechanisms.
Suppose we have $N$ units, indexed by $i$.
Covariates: Pre-treatment/background characteristics of the experimental units, or their data that are unaffected by treatment assignment.
$X_i$: $K$-component vector of covariates for unit $i$. Each $X_i \in \mathbb{R}^K$.
$\mathbf{X} = \begin{pmatrix} X_1' \\ X_2' \\ \vdots \\ X_N' \end{pmatrix}$: $N \times K$ matrix of covariates for all units.
$Y_i(0), Y_i(1)$: control and treatment potential outcomes for unit $i$.
$\mathbf{Y}(0) = \begin{pmatrix} Y_1(0) \\ Y_2(0) \\ \vdots \\ Y_N(0) \end{pmatrix}$: $N$-component vector of control potential outcomes for all units.
$\mathbf{Y}(1) = \begin{pmatrix} Y_1(1) \\ Y_2(1) \\ \vdots \\ Y_N(1) \end{pmatrix}$: $N$-component vector of treatment potential outcomes for all units.
☝ We assume SUTVA holds.
$W_i$: treatment assignment indicator for unit $i$.
$$ W_i = \begin{cases} 1 & \mbox{if unit} \ i \ \mbox{is assigned treatment,} \\ 0 & \mbox{if unit} \ i \ \mbox{is assigned control}. \end{cases} $$

$\mathbf{W} = \begin{pmatrix} W_1 \\ W_2 \\ \vdots \\ W_N \end{pmatrix}$: $N$-component vector of treatment assignment indicators for all units.
We shall typically denote the number of units assigned treatment and control as, respectively, $$ N_1 = \sum_{i=1}^N W_i, $$ $$ N_0 = \sum_{i=1}^N \left ( 1-W_i \right ). $$
Observed outcomes are functions of potential outcomes and treatment assignment indicators.
$$ y_i^{\mathrm{obs}} = W_iY_i(1) + (1-W_i)Y_i(0) $$

$$ y_i^{\mathrm{mis}} = (1-W_i)Y_i(1) + W_iY_i(0) $$

☝ Potential outcomes and causal effects are well-defined regardless of treatment assignment indicators.
⚠ Do not confuse potential outcomes with observed outcomes!
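A minimal sketch of the mapping from potential outcomes to observed and missing outcomes (toy values, chosen only for illustration):

```r
W  <- c(1, 0, 1)  # treatment assignment indicators
Y0 <- c(4, 6, 2)  # control potential outcomes
Y1 <- c(5, 7, 9)  # treatment potential outcomes

y_obs <- W * Y1 + (1 - W) * Y0  # c(5, 6, 9): what we actually see
y_mis <- (1 - W) * Y1 + W * Y0  # c(4, 7, 2): what we never see
```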
The assignment mechanism is a description of the probability of any vector of assignments for the entire population.
Assignment mechanism: $\mathrm{Pr}\{\mathbf{W} \mid \mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1) \}$.
☝ $\displaystyle \sum_{\mathbf{w} \in \{0,1\}^N} \mathrm{Pr}\{\mathbf{W} = \mathbf{w} \mid \mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1) \} = 1$ (Imbens & Rubin, 2015: p. 34).
Unit-level assignment probability for unit $i$: $p_i(\mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1)) = \displaystyle \sum_{\mathbf{w}: w_i = 1} \mathrm{Pr}\{\mathbf{W} = \mathbf{w} \mid \mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1) \}$.
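For example, in a completely randomized design with $N_1$ of $N$ units assigned treatment, every assignment vector with $\sum_i w_i = N_1$ has probability $1/\binom{N}{N_1}$, so the unit-level assignment probability reduces to $N_1/N$. A quick check in R:

```r
N  <- 6  # total units
N1 <- 3  # units assigned treatment

# Number of assignment vectors with w_i = 1, over the total number of vectors:
p_i <- choose(N - 1, N1 - 1) / choose(N, N1)
p_i  # 0.5, i.e., N1 / N
```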
Individualistic assignment mechanism: There exists a function $q: \mathbb{R}^{K+2} \rightarrow (0,1)$ such that for all subjects $i$, $$ p_i(\mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1)) = q(X_i, Y_i(0), Y_i(1)) $$ and $$\mathrm{Pr} \{ \mathbf{W} \mid \mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1) \} = c \displaystyle \prod_{i=1}^N q(X_i, Y_i(0), Y_i(1))^{W_i}\{1-q(X_i,Y_i(0),Y_i(1))\}^{1-W_i} $$ for some set of $(\mathbf{W}, \mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1))$, where $c$ is the normalization constant for the probability mass function of the treatment assignment mechanism.
💡 If an assignment mechanism is not individualistic, then some subjects' treatment assignments would depend on the covariates and potential outcomes of other subjects, which would complicate the design and analysis of experiments or observational studies.
Probabilistic assignment mechanism: For all subjects $i$, $0 < p_i(\mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1)) < 1$.
💡 A probabilistic assignment mechanism permits the consideration of all subjects for the design and analysis of an experiment or observational study, and reduces the risk of extrapolation biases when estimating treatment effects.
Unconfounded assignment mechanism: For any $\mathbf{w} \in \{0, 1\}^N$ and potential outcome vectors $\mathbf{Y}(0), \mathbf{Y}'(0), \mathbf{Y}(1), \mathbf{Y}'(1) \in \mathbb{R}^N$, $$ \mathrm{Pr} \{ \mathbf{W} = \mathbf{w} \mid \mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1) \} = \mathrm{Pr} \{ \mathbf{W} = \mathbf{w} \mid \mathbf{X}, \mathbf{Y}'(0), \mathbf{Y}'(1) \}. $$
💡 No lurking confounders! The observed covariates contain all the information governing treatment assignment, and no additional variables associated with the outcomes are related to treatment assignment.
Regular assignment mechanism/Strongly ignorable treatment assignment: Combination of the individualistic, probabilistic, and unconfounded assignment assumptions.
💡 A regular assignment mechanism justifies designing an observational study so as to compare treated and control subjects with the same covariates for inferring causal effects.
If a treated and control subject have the same covariates, then their treatment assignments have effectively been performed according to some random mechanism.
Comparing treated and control subjects with the same covariates should thus yield unbiased inferences for the treatment effects in the designed observational study.
Propensity score for a regular assignment mechanism: Unit-level assignment probability, typically denoted by the function $e: \mathbb{R}^K \rightarrow (0,1)$.
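In practice the propensity score is unknown and must be estimated from data. A minimal sketch with logistic regression on simulated data (all variables below are made up for illustration; any flexible classifier could stand in for `glm`):

```r
set.seed(1)
N <- 500
X <- rnorm(N)                       # a single covariate
W <- rbinom(N, 1, plogis(0.5 * X))  # true propensity score: e(x) = plogis(0.5 x)

fit   <- glm(W ~ X, family = binomial)  # logistic-regression propensity model
e_hat <- fitted(fit)                    # estimated propensity scores
range(e_hat)                            # strictly inside (0, 1), as required
```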
Experimental Unit $i$ | $\mathbf{X}_i$ | $W_i$ | $Y_{i}(0)$ | $Y_{i}(1)$ |
---|---|---|---|---|
$1$ | $\mathbf{X}_1$ | $W_1$ | $Y_{1}(0)$ | $Y_{1}(1)$ |
$2$ | $\mathbf{X}_2$ | $W_2$ | $Y_{2}(0)$ | $Y_{2}(1)$ |
$\vdots$ | $\vdots$ | $\vdots$ | $\vdots$ | $\vdots$ |
$N$ | $\mathbf{X}_N$ | $W_N$ | $Y_{N}(0)$ | $Y_{N}(1)$ |
Machine learning is popular for deriving causal inferences from experiments and (especially) observational studies.
Machine learning enables the powerful incorporation of covariates for causal inference in the presence of complicated treatment effect heterogeneity.
One standard approach for deriving causal inferences via machine learning corresponds to specifying a model on the observed outcomes, with the predictors consisting of the treatment indicators and covariates.
In certain cases, inferences on the unknown parameters in the machine learning algorithm can be obtained, but without additional assumptions such inferences are technically correlational, not causal.
Consideration of the Rubin Causal Model helps to illuminate valid applications of machine learning algorithms for causal inferences.
💡 The RCM enables one to specify a consistent definition of a causal estimand, i.e., one that does not change depending on the method of analysis for the observed data. Accordingly, causal inferences obtained from different analytical approaches can be directly compared. Also, one does not have to specify a causal estimand in terms of parameters in a model or machine learning algorithm.
Experimental Units
$445$ men in 1976.
Covariates
Age, years of education, ethnicity, etc.
Treatment Factor
Activity, with two levels: job training (1) or nothing (0).
Potential Outcome
Annual earnings in 1978 (in dollars).
Assignment Mechanism
Completely randomized design (CRD) with $185$ treated units ($N_1 = 185$, $N_0 = 260$).
Question of Interest
What is the causal effect of the job training program, accounting for the subjects' covariates?
⚠ For the purpose of today's introductory presentation we give a simple, illustrative case study here.
Lalonde_data = read.csv("Lalonde_data.csv", header=T)
Lalonde_data = Lalonde_data[,-c(1,14)]
Lalonde_data = Lalonde_data[,c(3:12,2,1)]
head(Lalonde_data)
MARR | NODEGREE | BLACK | HISPANIC | EDUC | AGE | RE74 | RE75 | U74 | U75 | TREAT | RE78 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <dbl> | <dbl> | <int> | <int> | <int> | <dbl> | |
1 | 0 | 1 | 1 | 0 | 10 | 23 | 0 | 0 | 1 | 1 | 0 | 0.0 |
2 | 0 | 0 | 0 | 0 | 12 | 26 | 0 | 0 | 1 | 1 | 0 | 12383.7 |
3 | 0 | 1 | 1 | 0 | 9 | 22 | 0 | 0 | 1 | 1 | 0 | 0.0 |
4 | 0 | 1 | 1 | 0 | 9 | 18 | 0 | 0 | 1 | 1 | 0 | 10740.1 |
5 | 0 | 1 | 1 | 0 | 11 | 45 | 0 | 0 | 1 | 1 | 0 | 11796.5 |
6 | 0 | 1 | 1 | 0 | 9 | 18 | 0 | 0 | 1 | 1 | 0 | 9227.1 |
Unit | $\mathbf{X}_i$ | $W_i$ | $Y_i(0)$ | $Y_i(1)$ |
---|---|---|---|---|
1 | $\mathbf{X}_1$ | 0 | ✔ | ? |
2 | $\mathbf{X}_2$ | 1 | ? | ✔ |
$\vdots$ | $\vdots$ | $\vdots$ | $\vdots$ | $\vdots$ |
N | $\mathbf{X}_N$ | 1 | ? | ✔ |
Super-population perspective (Imbens & Rubin, 2015: p. 113 - 114)
Experimental units are drawn from an infinite super-population of units.
Models are specified for the conditional mean of potential outcomes in the super-population.
The typical causal estimand is the average treatment effect, which can be formulated either in terms of a parameter in the machine learning algorithm (if possible) or by means of average predictive comparisons (Gelman and Pardoe, 2007).
The validity of the models, in terms of whether they accurately describe the conditional mean, is immaterial for the large-sample unbiasedness of the estimator of the average treatment effect.
All results (bias, variance, etc.) are asymptotic (large sample) results.
Finite-population perspective
No assumptions for how the experimental units came to be enrolled into the study are necessary.
Machine learning algorithms are specified for the conditional distributions so as to impute missing potential outcomes and derive posterior distributions for causal estimands of interest.
Causal estimands are defined as comparisons of potential outcomes for the finite population of the experimental units in the study.
One is not limited to considering just average treatment effects as the causal estimands.
Causal estimands are separated from the potential outcome models induced by machine learning algorithms (so that the Science is separated from Learning).
The validity of the machine learning algorithm could be important for the consistency of the causal effect estimators, and should be diagnosed before drawing final conclusions.
The physical act of randomization in completely randomized designs can yield robustness to model misspecification (Garcia-Horton, 2015).
Certain machine learning algorithms can induce a type of probability model for the potential outcomes.
The set of potential outcomes $(Y_i(0), Y_i(1))$ is equivalent to the set of observed and missing potential outcomes $(y_i^{\mathrm{obs}}, y_i^{\mathrm{mis}})$, where
$$ y_i^{\mathrm{obs}} = W_iY_i(1) + (1-W_i)Y_i(0), $$

$$ y_i^{\mathrm{mis}} = (1-W_i)Y_i(1) + W_iY_i(0). $$

If a distribution $p \left (y_i^{\mathrm{mis}} \mid \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W} \right )$ can be specified based on the machine learning algorithm, then each $y_i^{\mathrm{mis}}$ can be imputed, and the value of the causal estimand under the imputation can be calculated.
The imputation is not perfect, as the predictions obtained from a machine learning algorithm are not perfect.
Multiple imputations of the missing potential outcomes can enable one to perform causal inferences that reflect uncertainty due to the assignment mechanism (i.e., due to the missingness in potential outcomes).
A key assumption for the valid application of multiple imputation is that the assignment mechanism is unconfounded (in addition to probabilistic and individualistic).
Under the finite-population perspective, the parameters $\boldsymbol{\theta}$ of a machine learning algorithm are effectively nuisance parameters, and should be integrated out.
$$ p \left (y_i^{\mathrm{mis}} \mid \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W} \right ) = \int p \left (y_i^{\mathrm{mis}} \mid \boldsymbol{\theta}, \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W} \right ) p \left ( \boldsymbol{\theta} \mid \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W} \right ) d\boldsymbol{\theta} $$

Ideally, the integration over the nuisance model parameters should be performed under the Bayesian paradigm, so as to correctly propagate the uncertainties associated with the parameters.
A complication that could potentially arise under the Bayesian paradigm is deriving $p \left ( \boldsymbol{\theta} \mid \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W} \right )$.
If draws can be obtained from $p \left ( \boldsymbol{\theta} \mid \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W} \right )$, then draws can immediately be obtained from $p \left (y_i^{\mathrm{mis}} \mid \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W} \right )$ by virtue of the integral above.
☝ The $\int$ operation corresponds to both "summa" and "simulate"!
In this manner, implementing the multiple imputation approach under the Bayesian paradigm is conceptually straightforward.
The bootstrap (Efron, 1979) can be used to approximate the Bayesian approach for multiple imputation, i.e., to approximate the integration of unknown (nuisance) parameters.
In particular, we create a bootstrap distribution to approximate $p \left ( \boldsymbol{\theta} \mid \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W} \right )$ (Little and Rubin, 2002: p. 216 - 217).
For bootstrap sample $m = 1, \ldots, M$:
Draw $\tilde{\boldsymbol{\theta}}^{(m)} \sim \mathcal{B} \left ( \boldsymbol{\theta} \mid \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W} \right )$.
Draw $\mathbf{y}^{\mathrm{mis}, (m)} \sim p \left ( \mathbf{y}^{\mathrm{mis}} \mid \mathbf{X}, \mathbf{y}^{\mathrm{obs}}, \mathbf{W}, \tilde{\boldsymbol{\theta}}^{(m)} \right )$.
Calculate the causal estimand of interest based on the imputed dataset.
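The three steps above can be sketched in a few lines, with a simple linear model standing in for the machine learning algorithm and simulated data (the model choice and the data-generating process are illustrative assumptions only; a proper imputation would also add residual noise to the predictions):

```r
set.seed(1)
N <- 200
X <- rnorm(N)
W <- rbinom(N, 1, 0.5)
y_obs <- 2 + X + 3 * W + rnorm(N)  # simulated observed outcomes; true effect is 3

M <- 100
tau_draws <- numeric(M)
for (m in 1:M) {
  # Step 1: bootstrap resample within arms to approximate the posterior of theta
  bt <- sample(which(W == 1), replace = TRUE)
  bc <- sample(which(W == 0), replace = TRUE)
  fit_t <- lm(y ~ x, data = data.frame(y = y_obs[bt], x = X[bt]))
  fit_c <- lm(y ~ x, data = data.frame(y = y_obs[bc], x = X[bc]))
  # Step 2: impute each unit's missing potential outcome
  y1 <- ifelse(W == 1, y_obs, predict(fit_t, data.frame(x = X)))
  y0 <- ifelse(W == 0, y_obs, predict(fit_c, data.frame(x = X)))
  # Step 3: compute the causal estimand on the completed dataset
  tau_draws[m] <- mean(y1 - y0)
}
mean(tau_draws)  # close to the true effect of 3
```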
We use the randomForest package in R and the nonparametric bootstrap to infer the treatment effect in the job training program.
⚠ In the interest of time, and for the purpose of giving an illustrative case study, we are skipping several important steps associated with analyzing data via machine learning algorithms.
More detailed treatments of causal inference via random forests are provided by Lu et al. (2018) and Künzel et al. (2019).
#install.packages("randomForest")
library(randomForest)
set.seed(1)
number_of_bootstrap_draws = 10^3
treatment_effect_estimates = rep(NA, number_of_bootstrap_draws)
Lalonde_data_treated = Lalonde_data[Lalonde_data$TREAT==1,]
Lalonde_data_control = Lalonde_data[Lalonde_data$TREAT==0,]
Lalonde_randomForest_treated = randomForest(x=Lalonde_data_treated[,1:10],
y=Lalonde_data_treated[,12])
Lalonde_randomForest_control = randomForest(x=Lalonde_data_control[,1:10],
y=Lalonde_data_control[,12])
treatment_effect_estimate = mean(c((Lalonde_data_treated[,12] -
predict(Lalonde_randomForest_control, Lalonde_data_treated[,1:10])),
(predict(Lalonde_randomForest_treated, Lalonde_data_control[,1:10]) -
Lalonde_data_control[,12])))
print(paste("The treatment effect estimate obtained from random forests is:", treatment_effect_estimate))
progress_bar = txtProgressBar(min=1, max=number_of_bootstrap_draws, style = 3)
for(i in 1:number_of_bootstrap_draws)
{
Lalonde_data_new_treated = Lalonde_data_treated[sample((1:nrow(Lalonde_data_treated)), replace=TRUE),]
Lalonde_randomForest_new_treated = randomForest(x=Lalonde_data_new_treated[,1:10],
y=Lalonde_data_new_treated[,12])
Lalonde_data_new_control = Lalonde_data_control[sample((1:nrow(Lalonde_data_control)), replace=TRUE),]
Lalonde_randomForest_new_control = randomForest(x=Lalonde_data_new_control[,1:10],
y=Lalonde_data_new_control[,12])
treatment_effect_estimates[i] = mean(c((Lalonde_data_new_treated[,12] -
predict(Lalonde_randomForest_new_control, Lalonde_data_new_treated[,1:10])),
(predict(Lalonde_randomForest_new_treated, Lalonde_data_new_control[,1:10]) -
Lalonde_data_new_control[,12])))
setTxtProgressBar(progress_bar, i)
}
close(progress_bar)
hist(treatment_effect_estimates, main="Bootstrap Distribution of Estimates of Causal Estimand", xlab="Treatment Effect")
mean(treatment_effect_estimates)
sd(treatment_effect_estimates)
quantile(treatment_effect_estimates, prob=c(0.025,0.25,0.5,0.75,0.975))
[1] "The treatment effect estimate obtained from random forests is: 1625.08650762209"
☝ The estimates of the unit-level causal effects are based on the observed and imputed potential outcomes.
Lalonde_linear_model = lm(RE78 ~ MARR + NODEGREE + BLACK + HISPANIC + EDUC + AGE + RE74 + RE75 + U74 + U75 + TREAT,
data=Lalonde_data)
summary(Lalonde_linear_model)
#1.671e+03 + c(-1,1)*qt(0.975, df=433)*6.411e+02
Call:
lm(formula = RE78 ~ MARR + NODEGREE + BLACK + HISPANIC + EDUC + AGE + RE74 + RE75 + U74 + U75 + TREAT, data = Lalonde_data)

Residuals:
   Min     1Q Median     3Q    Max
 -9612  -4355  -1572   3054  53119

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)  2.567e+02  3.522e+03   0.073  0.94193
MARR        -1.463e+02  8.823e+02  -0.166  0.86835
NODEGREE    -1.518e+01  1.006e+03  -0.015  0.98796
BLACK       -2.037e+03  1.174e+03  -1.736  0.08331 .
HISPANIC     4.258e+02  1.565e+03   0.272  0.78562
EDUC         4.008e+02  2.288e+02   1.751  0.08058 .
AGE          5.357e+01  4.581e+01   1.170  0.24284
RE74         1.234e-01  8.784e-02   1.405  0.16080
RE75         1.974e-02  1.503e-01   0.131  0.89554
U74          1.380e+03  1.188e+03   1.162  0.24590
U75         -1.071e+03  1.025e+03  -1.045  0.29651
TREAT        1.671e+03  6.411e+02   2.606  0.00948 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6517 on 433 degrees of freedom
Multiple R-squared:  0.05822, Adjusted R-squared:  0.0343
F-statistic: 2.433 on 11 and 433 DF,  p-value: 0.005974
An observational study involves an assignment mechanism whose functional form is unknown.
The analyses of observational studies are typically complicated by systematic differences in covariates across the different treatments.
Such differences can confound inferences and introduce bias.
Cochran (1965, 1968), Rubin (1973): Observational studies can be designed so as to reduce biases due to confounding, and yield valid causal inferences.
☝ Randomized experiments typically have the desirable feature that if sufficient care is taken in their design/randomization, then different statistical techniques will yield the correct answer, potentially with different levels of precision.
#install.packages("vioplot")
library(vioplot)
options(warn=-1)
experiment_data = Lalonde_data
numerical_covariate_balance = function(treated, control, covariate_name)
{
mean_t = mean(treated)
mean_c = mean(control)
var_t = var(treated)
var_c = var(control)
pooled_var = (var_t*(length(treated)-1) + var_c*(length(control)-1))/(length(treated)+length(control)-2)
standard_err = (pooled_var*(1/length(treated) + 1/length(control)))^0.5
degrees_of_freedom = length(treated) + length(control) - 2
test_statistic = (mean_t - mean_c)/(standard_err)
p_value = 2*min(pt(test_statistic, df=degrees_of_freedom, lower.tail=TRUE),
pt(test_statistic, df=degrees_of_freedom, lower.tail=FALSE))
vioplot(treated, control, names=c("Treatment", "Control"), col="cyan", horizontal=TRUE, side="right")
print(paste(covariate_name,":", mean_t, ",", mean_c, ",", p_value))
}
categorical_covariate_balance = function(treated, control, covariate_name)
{
p_value = prop.test(x=c(sum(treated),sum(control)),
n=c(length(treated), length(control)),
alternative="two.sided", correct=TRUE)$p.value
print(paste(covariate_name,":", mean(treated), ",", mean(control), ",", p_value))
}
categorical_covariate_balance(experiment_data$MARR[experiment_data$TREAT==1],
experiment_data$MARR[experiment_data$TREAT==0],
"Marriage")
categorical_covariate_balance(experiment_data$NODEGREE[experiment_data$TREAT==1],
experiment_data$NODEGREE[experiment_data$TREAT==0],
"No High School Degree")
categorical_covariate_balance(experiment_data$BLACK[experiment_data$TREAT==1],
experiment_data$BLACK[experiment_data$TREAT==0],
"African-American")
categorical_covariate_balance(experiment_data$HISPANIC[experiment_data$TREAT==1],
experiment_data$HISPANIC[experiment_data$TREAT==0],
"Hispanic")
categorical_covariate_balance(experiment_data$U74[experiment_data$TREAT==1],
experiment_data$U74[experiment_data$TREAT==0],
"Unemployed in 1974")
categorical_covariate_balance(experiment_data$U75[experiment_data$TREAT==1],
experiment_data$U75[experiment_data$TREAT==0],
"Unemployed in 1975")
numerical_covariate_balance(experiment_data$EDUC[experiment_data$TREAT==1],
experiment_data$EDUC[experiment_data$TREAT==0],
"Education")
numerical_covariate_balance(experiment_data$AGE[experiment_data$TREAT==1],
experiment_data$AGE[experiment_data$TREAT==0],
"Age")
numerical_covariate_balance(experiment_data$RE74[experiment_data$TREAT==1],
experiment_data$RE74[experiment_data$TREAT==0],
"1974 Income")
numerical_covariate_balance(experiment_data$RE75[experiment_data$TREAT==1],
experiment_data$RE75[experiment_data$TREAT==0],
"1975 Income")
[1] "Marriage : 0.189189189189189 , 0.153846153846154 , 0.393600016056372"
[1] "No High School Degree : 0.708108108108108 , 0.834615384615385 , 0.00214685437568014"
[1] "African-American : 0.843243243243243 , 0.826923076923077 , 0.744021068468005"
[1] "Hispanic : 0.0594594594594595 , 0.107692307692308 , 0.108868987268592"
[1] "Unemployed in 1974 : 0.708108108108108 , 0.75 , 0.381380629582063"
[1] "Unemployed in 1975 : 0.6 , 0.684615384615385 , 0.0813493975970139"
[1] "Education : 10.3459459459459 , 10.0884615384615 , 0.135411167169439"
[1] "Age : 25.8162162162162 , 25.0538461538462 , 0.264764268805686"
[1] "1974 Income : 2095.57405405405 , 2107.02538461538 , 0.982320764091842"
[1] "1975 Income : 1532.05675675676 , 1266.91 , 0.382253154580966"
Covariate | $\bar{X}_1$ | $\bar{X}_0$ | $p$-value |
---|---|---|---|
Marriage | $0.19$ | $0.15$ | $0.327$ |
No Degree | $0.71$ | $0.83$ | $0.0014$ |
Black | $0.84$ | $0.83$ | $0.649$ |
Hispanic | $0.06$ | $0.11$ | $0.076$ |
Years Education | $10.35$ | $10.09$ | $0.14$ |
Age | $25.82$ | $25.05$ | $0.265$ |
$1974$ Earnings | $2095.57$ | $2107.03$ | $0.98$ |
$1975$ Earnings | $1532.07$ | $1266.91$ | $0.382$ |
Unemployed $1974$ | $0.71$ | $0.75$ | $0.326$ |
Unemployed $1975$ | $0.60$ | $0.68$ | $0.065$ |
LaLonde (1986), and later Dehejia & Wahba (1999), considered what would happen if traditional statistical methodologies were used to study data from an observational study, not an experiment.
As we already know the "correct" answer for this job training program experiment, they constructed an observational study using just the treated units in this experiment, and investigated whether they would get the same "correct" answer from the observational study.
Specifically, suppose we had data on only the 185 treated units from the job training program, and did not know a great deal about how these units came to be treated.
To learn about the effect of the job training program on annual earnings, we form a control group by collecting data on 2490 units drawn from the Panel Study of Income Dynamics.
We collect the same covariates and response for all units.
🤔 Do we get the correct causal inferences for the effect of the job training program on annual earnings from this observational study?
observational_data = read.table("Lalonde_observational_data.txt", header=T, sep="")
observational_data_treated = observational_data[observational_data$TREAT==1,]
observational_data_control = observational_data[observational_data$TREAT==0,]
observational_randomForest_treated = randomForest(x=observational_data_treated[,3:12],
y=observational_data_treated[,1])
observational_randomForest_control = randomForest(x=observational_data_control[,3:12],
y=observational_data_control[,1])
treatment_effect_estimate = mean(c((observational_data_treated[,1] -
predict(observational_randomForest_control, observational_data_treated[,3:12])),
(predict(observational_randomForest_treated, observational_data_control[,3:12]) -
observational_data_control[,1])))
print(paste("The treatment effect estimate obtained from random forests is:", treatment_effect_estimate))
number_of_bootstrap_draws = 10^3
treatment_effect_estimates = rep(NA, number_of_bootstrap_draws)
observational_data_treated = observational_data[observational_data$TREAT==1,]
observational_data_control = observational_data[observational_data$TREAT==0,]
progress_bar = txtProgressBar(min=1, max=number_of_bootstrap_draws, style = 3)
for(i in 1:number_of_bootstrap_draws)
{
observational_data_new_treated = observational_data_treated[sample((1:nrow(observational_data_treated)), replace=TRUE),]
observational_randomForest_new_treated = randomForest(x=observational_data_new_treated[,3:12],
y=observational_data_new_treated[,1])
observational_data_new_control = observational_data_control[sample((1:nrow(observational_data_control)), replace=TRUE),]
observational_randomForest_new_control = randomForest(x=observational_data_new_control[,3:12],
y=observational_data_new_control[,1])
treatment_effect_estimates[i] = mean(c((observational_data_new_treated[,1] -
predict(observational_randomForest_new_control,
observational_data_new_treated[,3:12])),
(predict(observational_randomForest_new_treated,
observational_data_new_control[,3:12]) -
observational_data_new_control[,1])))
setTxtProgressBar(progress_bar, i)
}
close(progress_bar)
hist(treatment_effect_estimates, main="Bootstrap Distribution of Estimates of Causal Estimand", xlab="Treatment Effect")
mean(treatment_effect_estimates)
sd(treatment_effect_estimates)
quantile(treatment_effect_estimates, prob=c(0.025,0.25,0.5,0.75,0.975))
[1] "The treatment effect estimate obtained from random forests is: -9614.72259267092"
Machine learning did not correct the bias introduced by imbalances in the covariates (Imbens & Rubin, 2015: p. 277)!
categorical_covariate_balance(observational_data$MARR[observational_data$TREAT==1],
observational_data$MARR[observational_data$TREAT==0],
"Marriage")
categorical_covariate_balance(observational_data$NODEGREE[observational_data$TREAT==1],
observational_data$NODEGREE[observational_data$TREAT==0],
"No High School Degree")
categorical_covariate_balance(observational_data$BLACK[observational_data$TREAT==1],
observational_data$BLACK[observational_data$TREAT==0],
"African-American")
categorical_covariate_balance(observational_data$HISPANIC[observational_data$TREAT==1],
observational_data$HISPANIC[observational_data$TREAT==0],
"Hispanic")
categorical_covariate_balance(observational_data$U74[observational_data$TREAT==1],
observational_data$U74[observational_data$TREAT==0],
"Unemployed in 1974")
categorical_covariate_balance(observational_data$U75[observational_data$TREAT==1],
observational_data$U75[observational_data$TREAT==0],
"Unemployed in 1975")
numerical_covariate_balance(observational_data$EDUC[observational_data$TREAT==1],
observational_data$EDUC[observational_data$TREAT==0],
"Education")
numerical_covariate_balance(observational_data$AGE[observational_data$TREAT==1],
observational_data$AGE[observational_data$TREAT==0],
"Age")
numerical_covariate_balance(observational_data$RE74[observational_data$TREAT==1],
observational_data$RE74[observational_data$TREAT==0],
"1974 Income")
numerical_covariate_balance(observational_data$RE75[observational_data$TREAT==1],
observational_data$RE75[observational_data$TREAT==0],
"1975 Income")
[1] "Marriage : 0.189189189189189 , 0.866265060240964 , 4.68007728305411e-117"
[1] "No High School Degree : 0.708108108108108 , 0.305220883534137 , 8.32164017836424e-29"
[1] "African-American : 0.843243243243243 , 0.250602409638554 , 5.12399430234964e-65"
[1] "Hispanic : 0.0594594594594595 , 0.0325301204819277 , 0.0836134327225996"
[1] "Unemployed in 1974 : 0.708108108108108 , 0.0863453815261044 , 2.22130836324017e-129"
[1] "Unemployed in 1975 : 0.6 , 0.1 , 1.91495775994449e-81"
[1] "Education : 10.3459459459459 , 12.1168674698795 , 2.00780829375752e-14"
[1] "Age : 25.8162162162162 , 34.8506024096386 , 3.09494054870162e-30"
[1] "1974 Income : 2095.57405405405 , 19428.7458428916 , 5.60415114162859e-65"
[1] "1975 Income : 1532.05675675676 , 19063.3375870281 , 5.44370824933261e-65"
Covariate | $\bar{X}_1$ | $\bar{X}_0$ | $p$-value |
---|---|---|---|
Marriage | $0.19$ | $0.87$ | $\approx 0$ |
No Degree | $0.71$ | $0.31$ | $\approx 0$ |
Black | $0.84$ | $0.25$ | $\approx 0$ |
Hispanic | $0.06$ | $0.03$ | $0.08$ |
Years Education | $10.35$ | $12.1$ | $\approx 0$ |
Age | $25.82$ | $34.85$ | $\approx 0$ |
$1974$ Earnings | $2095.57$ | $19429$ | $\approx 0$ |
$1975$ Earnings | $1532.06$ | $19063$ | $\approx 0$ |
Unemployed $1974$ | $0.71$ | $0.09$ | $\approx 0$ |
Unemployed $1975$ | $0.60$ | $0.1$ | $\approx 0$ |
In general, treated and control units in an observational study will differ (sometimes immensely) in terms of their covariates.
Bias in estimation of the treatment effect will then be introduced by imbalances in covariates.
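To see this bias concretely, consider a simple illustration (the linear model and constant additive effect here are illustrative assumptions, not part of the analysis of these data): if $Y_i(1) = Y_i(0) + \tau$ and $\mathbb{E} \{ Y_i(0) \mid X_i \} = \alpha + \beta^\top X_i$, then the raw difference in observed group means satisfies
$$ \mathbb{E} \left ( \bar{Y}_t^{\mathrm{obs}} - \bar{Y}_c^{\mathrm{obs}} \right ) = \tau + \beta^\top \left ( \bar{X}_1 - \bar{X}_0 \right ), $$
so the bias is driven exactly by the covariate imbalance $\bar{X}_1 - \bar{X}_0$.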
LaLonde (1986): Traditional analyses of observational studies are fundamentally flawed.
Intuitively, we need to discard control units who are not comparable to treated units, so as to compare "like with like" and remove such biases.
🤔 How do we determine the units to discard in the presence of many covariates?
Just as in designed experiments, the propensity score plays a central role in the design, and ultimately the analysis, of an observational study.
The propensity score for a unit $i$ is defined as (excluding technicalities at this point) $$ e(X_i) = \mathrm{Pr}(W_i = 1 \mid X_i). $$
💡 An estimated propensity score serves as a one-dimensional summary of all covariates.
Within any subpopulation of units sharing the same propensity score value, the covariates are identically distributed across the treatment and control groups (Imbens & Rubin, 2015: p. 266, 277).
Discarding units with extreme propensity scores, and further subclassifying/matching the remaining units with respect to their propensity scores, can reduce the bias in estimating causal effects from observational studies (Imbens & Rubin, 2015: Parts III, IV).
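These points rest on the balancing property of the propensity score, derived formally later in this presentation: conditional on the propensity score, treatment assignment is independent of the covariates,
$$ X_i \perp W_i \mid e(X_i). $$
Hence units with (approximately) equal propensity scores can be compared as if their covariates had been balanced by randomization.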
Consider estimation of propensity scores based on a logistic regression with main effects for all covariates.
Control units with estimated propensity scores lower than the minimum of the treated units' estimated propensity scores, or larger than the maximum of the treated units' estimated propensity scores, are discarded.
These units are irrelevant to any analysis we wish to perform: we want to infer the treatment effect for units who resemble those in the treatment group, and we do not intend to extrapolate.
After discarding these units, 1208 control units remain.
propensity_score_model = glm(TREAT ~ MARR + NODEGREE + BLACK + HISPANIC + EDUC + AGE + RE74 + RE75 + U74 + U75,
data=observational_data, family=binomial)
propensity_scores = propensity_score_model$fitted.values
hist(propensity_scores[1:185], main="Estimated Propensity Scores for Treated Units", xlab="Propensity Score")
hist(propensity_scores[186:2675], main="Estimated Propensity Scores for Control Units", xlab="Propensity Score")
min_treat_prop = min(propensity_scores[1:185])
max_treat_prop = max(propensity_scores[1:185])
data = cbind(observational_data, propensity_scores)
treated_units = data[1:185,]
new_control_units = data[data$TREAT==0 & data$propensity_scores>=min_treat_prop & data$propensity_scores<=max_treat_prop,]
new_data = rbind(treated_units, new_control_units)
hist(propensity_scores[1:185], main="Estimated Propensity Scores for Treated Units", xlab="Propensity Score")
hist(new_control_units$propensity_scores, main="Estimated Propensity Scores for Remaining Control Units", xlab="Propensity Score")
categorical_covariate_balance(new_data$MARR[new_data$TREAT==1],
new_data$MARR[new_data$TREAT==0],
"Marriage")
categorical_covariate_balance(new_data$NODEGREE[new_data$TREAT==1],
new_data$NODEGREE[new_data$TREAT==0],
"No High School Degree")
categorical_covariate_balance(new_data$BLACK[new_data$TREAT==1],
new_data$BLACK[new_data$TREAT==0],
"African-American")
categorical_covariate_balance(new_data$HISPANIC[new_data$TREAT==1],
new_data$HISPANIC[new_data$TREAT==0],
"Hispanic")
categorical_covariate_balance(new_data$U74[new_data$TREAT==1],
new_data$U74[new_data$TREAT==0],
"Unemployed in 1974")
categorical_covariate_balance(new_data$U75[new_data$TREAT==1],
new_data$U75[new_data$TREAT==0],
"Unemployed in 1975")
numerical_covariate_balance(new_data$EDUC[new_data$TREAT==1],
new_data$EDUC[new_data$TREAT==0],
"Education")
numerical_covariate_balance(new_data$AGE[new_data$TREAT==1],
new_data$AGE[new_data$TREAT==0],
"Age")
numerical_covariate_balance(new_data$RE74[new_data$TREAT==1],
new_data$RE74[new_data$TREAT==0],
"1974 Income")
numerical_covariate_balance(new_data$RE75[new_data$TREAT==1],
new_data$RE75[new_data$TREAT==0],
"1975 Income")
[1] "Marriage : 0.189189189189189 , 0.780629139072848 , 1.09554668599274e-59"
[1] "No High School Degree : 0.708108108108108 , 0.410596026490066 , 6.62441037350883e-14"
[1] "African-American : 0.843243243243243 , 0.43294701986755 , 5.84073058508118e-25"
[1] "Hispanic : 0.0594594594594595 , 0.048841059602649 , 0.66360489445241"
[1] "Unemployed in 1974 : 0.708108108108108 , 0.165562913907285 , 5.45838970175597e-58"
[1] "Unemployed in 1975 : 0.6 , 0.204470198675497 , 5.61158183055686e-30"
[1] "Education : 10.3459459459459 , 11.2557947019868 , 7.14192277404495e-05"
[1] "Age : 25.8162162162162 , 31.8998344370861 , 2.60148220574681e-14"
[1] "1974 Income : 2095.57405405405 , 11051.4292944536 , 2.05524196773978e-44"
[1] "1975 Income : 1532.05675675676 , 9359.71751937086 , 6.39731949833502e-48"
Covariate | $\bar{X}_1$ | $\bar{X}_0$ | $p$-value |
---|---|---|---|
Marriage | $0.19$ | $0.78$ | $\approx 0$ |
No Degree | $0.71$ | $0.41$ | $\approx 0$ |
Black | $0.84$ | $0.43$ | $\approx 0$ |
Hispanic | $0.06$ | $0.05$ | $0.66$ |
Years Education | $10.35$ | $11.3$ | $\approx 0$ |
Age | $25.82$ | $31.9$ | $\approx 0$ |
$1974$ Earnings | $2095.57$ | $11051$ | $\approx 0$ |
$1975$ Earnings | $1532.06$ | $9360$ | $\approx 0$ |
Unemployed $1974$ | $0.71$ | $0.17$ | $\approx 0$ |
Unemployed $1975$ | $0.60$ | $0.2$ | $\approx 0$ |
observational_data_treated = new_data[new_data$TREAT==1,]
observational_data_control = new_data[new_data$TREAT==0,]
observational_randomForest_treated = randomForest(x=observational_data_treated[,3:12],
y=observational_data_treated[,1])
observational_randomForest_control = randomForest(x=observational_data_control[,3:12],
y=observational_data_control[,1])
treatment_effect_estimate = mean(c((observational_data_treated[,1] -
predict(observational_randomForest_control, observational_data_treated[,3:12])),
(predict(observational_randomForest_treated, observational_data_control[,3:12]) -
observational_data_control[,1])))
print(paste("The treatment effect estimate obtained from random forests is:", treatment_effect_estimate))
number_of_bootstrap_draws = 10^3
treatment_effect_estimates = rep(NA, number_of_bootstrap_draws)
progress_bar = txtProgressBar(min=1, max=number_of_bootstrap_draws, style = 3)
for(i in 1:number_of_bootstrap_draws)
{
observational_data_new_treated = observational_data_treated[sample((1:nrow(observational_data_treated)), replace=TRUE),]
observational_randomForest_new_treated = randomForest(x=observational_data_new_treated[,3:12],
y=observational_data_new_treated[,1])
observational_data_new_control = observational_data_control[sample((1:nrow(observational_data_control)), replace=TRUE),]
observational_randomForest_new_control = randomForest(x=observational_data_new_control[,3:12],
y=observational_data_new_control[,1])
treatment_effect_estimates[i] = mean(c((observational_data_new_treated[,1] -
predict(observational_randomForest_new_control,
observational_data_new_treated[,3:12])),
(predict(observational_randomForest_new_treated,
observational_data_new_control[,3:12]) -
observational_data_new_control[,1])))
setTxtProgressBar(progress_bar, i)
}
close(progress_bar)
hist(treatment_effect_estimates, main="Bootstrap Distribution of Estimates of Causal Estimand", xlab="Treatment Effect")
mean(treatment_effect_estimates)
sd(treatment_effect_estimates)
quantile(treatment_effect_estimates, prob=c(0.025,0.25,0.5,0.75,0.975))
[1] "The treatment effect estimate obtained from random forests is: -3761.77835337285"
Estimation of treatment effects through propensity score subclassification consists of the following broad steps.
Form subclasses of units with similar propensity scores.
Check that balance has been achieved on covariates.
Estimate treatment effects individually for each subclass.
Calculate the weighted average of treatment effect estimates across subclasses to estimate the treatment effect.
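In symbols (with notation introduced here for this summary): if subclass $j \in \{1, \ldots, J\}$ contains $N_{t,j}$ treated units and yields the estimate $\hat{\tau}_j$, then weighting by the treated counts — as done in the code below via `weights = number_treated_units/sum(number_treated_units)` — targets the average effect on the treated:
$$ \hat{\tau} = \sum_{j=1}^{J} \lambda_j \hat{\tau}_j, \qquad \lambda_j = \frac{N_{t,j}}{\sum_{j'=1}^{J} N_{t,j'}}. $$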
For example, consider subclasses defined by the quantiles (0,0.6,0.8,0.9,0.95,1) of the estimated propensity scores.
The numbers of treated units in the five subclasses are: 6, 10, 43, 62, 64.
The numbers of control units in the five subclasses are: 830, 268, 96, 8, 6.
quantiles = quantile(new_data$propensity_scores, probs=c(0, 0.6, 0.8, 0.9, 0.95, 1))
subclass_1 = new_data[new_data$propensity_scores<=quantiles[2],]
subclass_2 = new_data[new_data$propensity_scores<=quantiles[3] & new_data$propensity_scores>quantiles[2],]
subclass_3 = new_data[new_data$propensity_scores<=quantiles[4] & new_data$propensity_scores>quantiles[3],]
subclass_4 = new_data[new_data$propensity_scores<=quantiles[5] & new_data$propensity_scores>quantiles[4],]
subclass_5 = new_data[new_data$propensity_scores>quantiles[5],]
number_treated_units = c(sum(subclass_1$TREAT==1),
sum(subclass_2$TREAT==1),
sum(subclass_3$TREAT==1),
sum(subclass_4$TREAT==1),
sum(subclass_5$TREAT==1))
number_control_units = c(sum(subclass_1$TREAT==0),
sum(subclass_2$TREAT==0),
sum(subclass_3$TREAT==0),
sum(subclass_4$TREAT==0),
sum(subclass_5$TREAT==0))
number_treated_units
number_control_units
subclass = rep(NA, nrow(new_data))
for(i in 1:nrow(new_data))
{
if(new_data$propensity_scores[i]<=quantiles[2]) subclass[i] = 1
if(new_data$propensity_scores[i]<=quantiles[3] & new_data$propensity_scores[i]>quantiles[2]) subclass[i] = 2
if(new_data$propensity_scores[i]<=quantiles[4] & new_data$propensity_scores[i]>quantiles[3]) subclass[i] = 3
if(new_data$propensity_scores[i]<=quantiles[5] & new_data$propensity_scores[i]>quantiles[4]) subclass[i] = 4
if(new_data$propensity_scores[i]>quantiles[5]) subclass[i] = 5
}
subclassified_data = cbind(new_data, subclass)
covariate_balance = function(subclass)
{
# Compare treated and control means for each covariate within a subclass.
covariate_names = c("MARR", "NODEGREE", "BLACK", "HISPANIC", "EDUC",
"AGE", "RE74", "RE75", "U74", "U75")
covariate_labels = c("Marriage", "No High School Degree", "African-American",
"Hispanic", "Education", "Age", "1974 Income",
"1975 Income", "Unemployed in 1974", "Unemployed in 1975")
subclass_t = sapply(covariate_names, function(v) mean(subclass[[v]][subclass$TREAT==1]))
subclass_c = sapply(covariate_names, function(v) mean(subclass[[v]][subclass$TREAT==0]))
print(cbind(c("Covariate", covariate_labels),
c("Treated", unname(round(subclass_t, 5))),
c("Control", unname(round(subclass_c, 5)))))
}
covariate_balance(subclass_1)
covariate_balance(subclass_2)
covariate_balance(subclass_3)
covariate_balance(subclass_4)
covariate_balance(subclass_5)
weights = number_treated_units/(sum(number_treated_units))
      [,1]                    [,2]         [,3]
 [1,] "Covariate"             "Treated"    "Control"
 [2,] "Marriage"              "0.83333"    "0.85783"
 [3,] "No High School Degree" "0.5"        "0.35663"
 [4,] "African-American"      "0.66667"    "0.35783"
 [5,] "Hispanic"              "0"          "0.03735"
 [6,] "Education"             "10.83333"   "11.44217"
 [7,] "Age"                   "30.83333"   "32.56265"
 [8,] "1974 Income"           "13069.45"   "13211.0693"
 [9,] "1975 Income"           "12866.6"    "11656.6609"
[10,] "Unemployed in 1974"    "0"          "0.08554"
[11,] "Unemployed in 1975"    "0"          "0.13373"
      [,1]                    [,2]         [,3]
 [1,] "Covariate"             "Treated"    "Control"
 [2,] "Marriage"              "0.6"        "0.70522"
 [3,] "No High School Degree" "0.3"        "0.48134"
 [4,] "African-American"      "0.4"        "0.51119"
 [5,] "Hispanic"              "0.2"        "0.06716"
 [6,] "Education"             "11.7"       "11.10075"
 [7,] "Age"                   "29.7"       "30.64925"
 [8,] "1974 Income"           "5823.02"    "7364.91315"
 [9,] "1975 Income"           "3277.96"    "5045.04885"
[10,] "Unemployed in 1974"    "0.3"        "0.29851"
[11,] "Unemployed in 1975"    "0.4"        "0.33582"
      [,1]                    [,2]         [,3]
 [1,] "Covariate"             "Treated"    "Control"
 [2,] "Marriage"              "0.25581"    "0.4375"
 [3,] "No High School Degree" "0.60465"    "0.64583"
 [4,] "African-American"      "0.76744"    "0.80208"
 [5,] "Hispanic"              "0.04651"    "0.09375"
 [6,] "Education"             "10.30233"   "10.15625"
 [7,] "Age"                   "28.37209"   "31.125"
 [8,] "1974 Income"           "4429.32791" "3979.45456"
 [9,] "1975 Income"           "2401.91628" "2690.48185"
[10,] "Unemployed in 1974"    "0.37209"    "0.40625"
[11,] "Unemployed in 1975"    "0.32558"    "0.41667"
      [,1]                    [,2]         [,3]
 [1,] "Covariate"             "Treated"    "Control"
 [2,] "Marriage"              "0.19355"    "0"
 [3,] "No High School Degree" "0.67742"    "0.5"
 [4,] "African-American"      "0.85484"    "0.875"
 [5,] "Hispanic"              "0.08065"    "0"
 [6,] "Education"             "10.69355"   "10.75"
 [7,] "Age"                   "26.8871"    "21.25"
 [8,] "1974 Income"           "976.9871"   "3639.33808"
 [9,] "1975 Income"           "806.92581"  "2523.0121"
[10,] "Unemployed in 1974"    "0.77419"    "0.5"
[11,] "Unemployed in 1975"    "0.66129"    "0.25"
      [,1]                    [,2]         [,3]
 [1,] "Covariate"             "Treated"    "Control"
 [2,] "Marriage"              "0.01562"    "0"
 [3,] "No High School Degree" "0.89062"    "0.83333"
 [4,] "African-American"      "0.96875"    "0.83333"
 [5,] "Hispanic"              "0.03125"    "0.16667"
 [6,] "Education"             "9.78125"    "10.66667"
 [7,] "Age"                   "21.98438"   "22.66667"
 [8,] "1974 Income"           "0"          "0"
 [9,] "1975 Income"           "314.67969"  "161.12903"
[10,] "Unemployed in 1974"    "1"          "1"
[11,] "Unemployed in 1975"    "0.8125"     "0.66667"
options(warn=-1)
number_of_bootstrap_draws = 10^3
# Bootstrap the random forest estimate of the treatment effect within one subclass.
bootstrap_subclass_estimates = function(subclass_data, number_of_bootstrap_draws)
{
estimates = rep(NA, number_of_bootstrap_draws)
subclass_treated = subclass_data[subclass_data$TREAT==1,]
subclass_control = subclass_data[subclass_data$TREAT==0,]
progress_bar = txtProgressBar(min=1, max=number_of_bootstrap_draws, style = 3)
for(i in 1:number_of_bootstrap_draws)
{
new_treated = subclass_treated[sample((1:nrow(subclass_treated)), replace=TRUE),]
randomForest_new_treated = randomForest(x=new_treated[,3:12], y=new_treated[,1])
new_control = subclass_control[sample((1:nrow(subclass_control)), replace=TRUE),]
randomForest_new_control = randomForest(x=new_control[,3:12], y=new_control[,1])
estimates[i] = mean(c((new_treated[,1] -
predict(randomForest_new_control, new_treated[,3:12])),
(predict(randomForest_new_treated, new_control[,3:12]) -
new_control[,1])))
setTxtProgressBar(progress_bar, i)
}
close(progress_bar)
estimates
}
treatment_effect_estimates_1 = bootstrap_subclass_estimates(subclass_1, number_of_bootstrap_draws)
treatment_effect_estimates_2 = bootstrap_subclass_estimates(subclass_2, number_of_bootstrap_draws)
treatment_effect_estimates_3 = bootstrap_subclass_estimates(subclass_3, number_of_bootstrap_draws)
treatment_effect_estimates_4 = bootstrap_subclass_estimates(subclass_4, number_of_bootstrap_draws)
treatment_effect_estimates_5 = bootstrap_subclass_estimates(subclass_5, number_of_bootstrap_draws)
overall_treatment_effect_estimates = weights%*%rbind(t(treatment_effect_estimates_1),
t(treatment_effect_estimates_2),
t(treatment_effect_estimates_3),
t(treatment_effect_estimates_4),
t(treatment_effect_estimates_5))
hist(overall_treatment_effect_estimates, main="Bootstrap Distribution of Estimates of Causal Estimand", xlab="Treatment Effect")
quantile(overall_treatment_effect_estimates, prob=c(0.025,0.25,0.5,0.75,0.975))
There are four broad classes of strategies for causal inferences from designed observational studies (Imbens & Rubin, 2015: p. 270 - 276).
Combinations of these strategies can also be implemented in practice.
Subclassification with covariate adjustment within subclasses, and matching with covariate adjustment, are particularly attractive methods.
"Experiments should be analyzed as experiments, not observational studies" (Freedman, 2006: p. 691).
Implication: Analyze experiments as you designed them, in particular, by means of randomization-based inferences. Don't analyze experiments by regression models (and, by extension, machine learning algorithms) typically used in the analyses of observational studies.
Counterargument:
Experiments have unconfounded assignment mechanisms.
$\Rightarrow$ Potential outcomes in experiments are missing at random.
$\Rightarrow$ Imputation methods based on models can be used to impute missing potential outcomes and perform inferences on finite-population causal estimands (assuming the models are appropriate).
"... randomization does not justify the assumptions behind the ols [ordinary least squares] model" (Freedman, 2008: p. 181).
Counterargument:
Randomization corresponds to an unconfounded assignment mechanism, which justifies standard types of imputation methods.
If a model does not capture the relationships between the potential outcomes, treatments, and covariates, specify a better model.
Recall the three assumptions for a regular assignment mechanism.
There exists a function $q: \mathbb{R}^{K+2} \rightarrow (0,1)$ such that for all subjects $i$, $$ p_i(\mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1)) = q(X_i, Y_i(0), Y_i(1)) $$ and $$\mathrm{Pr} \{ \mathbf{W} \mid \mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1) \} = c \displaystyle \prod_{i=1}^N q(X_i, Y_i(0), Y_i(1))^{W_i}\{1-q(X_i,Y_i(0),Y_i(1))\}^{1-W_i} $$ for some set of $(\mathbf{W}, \mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1))$, where $c$ is the normalization constant for the probability mass function of the treatment assignment mechanism.
For all subjects $i$, $0 < p_i(\mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1)) < 1$.
For any $\mathbf{w} \in \{0, 1\}^N$ and potential outcome vectors $\mathbf{Y}(0), \mathbf{Y}'(0), \mathbf{Y}(1), \mathbf{Y}'(1) \in \mathbb{R}^N$, $$ \mathrm{Pr} \{ \mathbf{W} = \mathbf{w} \mid \mathbf{X}, \mathbf{Y}(0), \mathbf{Y}(1) \} = \mathrm{Pr} \{ \mathbf{W} = \mathbf{w} \mid \mathbf{X}, \mathbf{Y}'(0), \mathbf{Y}'(1) \}. $$
☝ Outside of sequential experiments (such as multi-armed bandits), the individualistic assignment assumption is not particularly controversial.
☝ If an experimental unit receives treatment with either probability zero or probability one, then estimates of the treatment effect for similar such experimental units will necessarily involve extrapolation, and so such units should not be considered for causal inferences. The probabilistic assignment mechanism is thus intuitive and justifiable.
☝ The unconfoundedness assumption is perhaps the most controversial assumption for causal inference on observational studies under the Rubin Causal Model. Having said that, it is commonly invoked across a wide range of domains (Imbens & Rubin, 2015: p. 262 - 263).
These regularity assumptions do not merely enable causal inferences for randomized experiments. They can also enable valid causal inferences for observational studies.
💡 If an (unknown) assignment mechanism is regular, then for that assignment mechanism we have that
a completely randomized design was effectively conducted for subpopulations of experimental units with the same covariates, and that
a causal interpretation can be attached to the comparison of observed outcomes for treated and control units within the subpopulations.
The second implication holds because the observed outcomes within a particular subpopulation constitute a random sample of the potential outcomes for that subpopulation, so that the difference in observed averages is unbiased for the subpopulation average treatment effect.
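In symbols, for the subpopulation with $X_i = x$ (writing $\tau(x)$ for its average treatment effect, notation introduced here), unconfoundedness gives
\begin{align} \mathbb{E} \left \{ Y_i^{\mathrm{obs}} \mid W_i = 1, X_i = x \right \} - \mathbb{E} \left \{ Y_i^{\mathrm{obs}} \mid W_i = 0, X_i = x \right \} &= \mathbb{E} \left \{ Y_i(1) \mid X_i = x \right \} - \mathbb{E} \left \{ Y_i(0) \mid X_i = x \right \} \\ &= \tau(x). \end{align}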
💡 The fact that the assignment mechanism is unknown does not matter for this result.
These desirable implications of a regular assignment mechanism can be operationalized for deriving valid, unbiased causal inferences from observational studies that have regular assignment mechanisms by means of the propensity scores $$ e(X_i) = \mathrm{Pr}(W_i = 1 \mid X_i). $$
Claim : For a regular assignment mechanism, $\mathrm{Pr} \left \{ W_i = 1 \mid X_i, e(X_i) \right \} = \mathrm{Pr} \left \{ W_i = 1 \mid e(X_i) \right \}$.
Proof : As $e(X_i)$ is a function of $X_i$, $\mathrm{Pr} \left \{ W_i = 1 \mid X_i, e(X_i) \right \} = \mathrm{Pr} \left \{ W_i = 1 \mid X_i \right \} = e(X_i)$. Also, by Adam's law (the law of iterated expectations),
\begin{align} \mathrm{Pr} \left \{ W_i = 1 \mid e(X_i) \right \} &= \mathbb{E} \left \{ W_i \mid e(X_i) \right \} \\ &= \mathbb{E} \left \{ \mathbb{E} \left \{ W_i \mid X_i, e(X_i) \right \} \mid e(X_i) \right \} \\ &= \mathbb{E} \left \{ e(X_i) \mid e(X_i) \right \} \\ &= e(X_i). \end{align}We thus immediately have the result. ∎
💡 An estimated propensity score serves as a one-dimensional summary of all covariates, such that experimental units with the same propensity score values have similar covariates.
💡 If many covariates are considered for the design and analysis of an observational study (e.g., so as to ensure unconfoundedness), it can be sufficient to design the study based on the propensity score so as to yield balance on the covariates involved in the propensity score.
Claim : For an unconfounded treatment assignment mechanism, the treatment assignment remains unconfounded given the propensity score, i.e., $$ \mathrm{Pr} \left \{ W_i = 1 \mid Y_i(0), Y_i(1), e(X_i) \right \} = \mathrm{Pr} \left \{ W_i = 1 \mid e(X_i) \right \} = e(X_i). $$
Proof :
\begin{align} \mathrm{Pr} \left \{ W_i = 1 \mid Y_i(0), Y_i(1), e(X_i) \right \} &= \mathbb{E} \left \{ W_i \mid Y_i(0), Y_i(1), e(X_i) \right \} \\ &= \mathbb{E} \left \{ \mathbb{E} \left \{ W_i \mid Y_i(0), Y_i(1), X_i, e(X_i) \right \} \mid Y_i(0), Y_i(1), e(X_i) \right \} \\ &= \mathbb{E} \left \{ \mathbb{E} \left \{ W_i \mid X_i, e(X_i) \right \} \mid Y_i(0), Y_i(1), e(X_i) \right \} \\ &= e(X_i), \end{align}where the third equality follows from unconfoundedness. ∎
💡 Given a vector of covariates that ensures unconfoundedness, adjusting for treatment-control differences in propensity scores suffices to remove the biases associated with differences in the covariates.
☝ This situation is analogous to that of a completely randomized design.
Claim : If $b(X_i)$ is a function of the covariates such that $\mathrm{Pr} \left \{ W_i = 1 \mid X_i, b(X_i) \right \} = \mathrm{Pr} \left \{ W_i = 1 \mid b(X_i) \right \}$ (i.e., $b(X_i)$ is a balancing score), then $e(X_i) = g(b(X_i))$ for some function $g$.
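A short proof of this claim, using the same conditioning arguments as above: since $e(X_i)$ is a function of $X_i$, $\mathrm{Pr} \left \{ W_i = 1 \mid X_i, b(X_i) \right \} = \mathrm{Pr} \left \{ W_i = 1 \mid X_i \right \} = e(X_i)$, and since $b(X_i)$ is a balancing score,
$$ e(X_i) = \mathrm{Pr} \left \{ W_i = 1 \mid X_i, b(X_i) \right \} = \mathrm{Pr} \left \{ W_i = 1 \mid b(X_i) \right \}. $$
The right-hand side is a function of $b(X_i)$ alone, so $e(X_i) = g(b(X_i))$ with $g(b) = \mathrm{Pr} \left \{ W_i = 1 \mid b(X_i) = b \right \}$. ∎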
💡 Among all balancing scores, the propensity score is the coarsest: it provides the greatest reduction in the number of variables that must be adjusted for.
The unconfoundedness assumption is critical to the utility of regular assignment mechanisms.
For an assignment mechanism to be unconfounded, we must have a sufficiently rich set of covariates such that adjusting for differences in their observed values between the treatment and control groups will remove systematic biases in the causal inferences.
The unconfoundedness assumption is typically the most controversial assumption for causal inferences on observational studies under the Rubin Causal Model.
Furthermore, this assumption is not testable: the data are not directly informative about the distribution of $Y_i(0)$ for those units $i$ given treatment, nor are they directly informative about the distribution of $Y_j(1)$ for those units $j$ given control.
Our inability to test unconfoundedness in general is similar to our inability to test whether a missing data mechanism is missing not at random (MNAR) or missing at random (MAR).
As unconfoundedness is such an important and controversial assumption, supplementary analyses that can assess its plausibility can be important for drawing causal conclusions.
The superpopulation perspective can be helpful for evaluating the frequentist properties of causal inference methods for observational studies with regular assignment mechanisms.
☝ In the previous derivations, covariates $X_i$ were considered to have been randomly drawn from a distribution.
The superpopulation perspective on unconfoundedness is also helpful for clarifying how the distributions of missing and observed potential outcomes are related.
💡 Under unconfoundedness, we have for all $i = 1, \ldots, N$ that $$ \left ( Y_i^{\mathrm{mis}} \mid W_i = w, X_i \right ) \sim \left ( Y_i^{\mathrm{obs}} \mid W_i = 1 - w, X_i \right), $$ so that, since $Y_i^{\mathrm{mis}} = (1-W_i)Y_i(1) + W_iY_i(0)$ and $Y_i^{\mathrm{obs}} = W_iY_i(1) + (1-W_i)Y_i(0)$, the distribution of a missing potential outcome equals the distribution of an observable outcome.
This is important because the data are not directly informative about the distribution of $Y_i^{\mathrm{mis}}$ unless we have an assumption such as unconfoundedness that connects this distribution to distributions that can be inferred from the data.
☝ Unconfoundedness is not a testable assumption. However, it does imply that one should compare "like with like", and thus it is well-motivated.
Although unconfoundedness is not testable, there do exist analyses that can be done to assess its validity.
If unconfoundedness is valid for an observational study, then biases in the causal inferences can be removed by adjusting for differences in covariates.
Such adjustments can be difficult to perform if there are a large number of covariates.
The propensity score is effectively a low-dimensional summary of covariates that is sufficient for removing biases in causal inferences from observational studies in which treatment and control groups differ in terms of covariates.
☝ In practice, it may not be necessary to get precise inferences on propensity scores. Instead, we use propensity score models as devices for designing observational studies with good covariate balance.
Covariate imbalances in observational studies can make causal inferences sensitive to changes in analysis methods (e.g., standard Neymanian inferences versus model-based inferences) and their specifications (e.g., covariates included in potential outcomes models).
In addition, covariate imbalances can make causal inferences imprecise.
Performing a design phase for an observational study can lead to robust and valid causal inferences for the sample of subjects included in the study.
For an observational study with a regular assignment mechanism, the estimated propensity scores can be useful for designing the observational study and obtaining valid, unbiased causal inferences.
The goal in estimating the propensity score is to design the observational study based on the estimates so as to obtain balanced covariate distributions between the treatment and control groups.
The goal in estimating the propensity score is *not* to obtain the best estimate of the true propensity scores.
Sometimes estimated propensity scores are better than true propensity scores in terms of yielding designed observational studies with good covariate balance.
Typically, we are not interested in interpreting the propensity scores, although assessing the strength and nature of the dependence of treatment assignment on the covariates could be helpful for assessing the unconfoundedness assumption.
The adequacy of a propensity score model specification can be assessed by means of the property that, for a regular assignment mechanism, $\mathrm{Pr} \left \{ W_i = 1 \mid X_i, e(X_i) \right \} = \mathrm{Pr} \left \{ W_i = 1 \mid e(X_i) \right \}$.
Specifically, using the estimates $\hat{e}(X_i)$ of the propensity scores, check whether the covariates are independent of the treatment indicator given the estimated propensity scores.
This assessment is done after constructing subclasses/strata/blocks of experimental units such that the units within a specific subclass have similar propensity scores (i.e., the propensity score estimates within a subclass have small variability).
If subclasses have small variability in the propensity score estimates, but poor covariate balance, then the propensity score model is most likely poorly specified.
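As a rough sketch of how this assessment works, consider simulated data with hypothetical covariates `x1` and `x2` (not the Lalonde data analyzed below): estimate propensity scores, form subclasses at the quintiles of the estimates, and compare within-subclass covariate means across treatment groups.

```r
# Sketch of propensity score subclassification on simulated data.
# (Hypothetical covariates x1, x2; not the Lalonde data used later.)
set.seed(1)
n  <- 2000
x1 <- rnorm(n)
x2 <- rbinom(n, 1, 0.5)
w  <- rbinom(n, 1, plogis(-0.5 + 0.8 * x1 + 0.6 * x2))  # treatment indicator

# Estimate propensity scores via logistic regression.
ps_fit <- glm(w ~ x1 + x2, family = binomial)
ps_hat <- fitted(ps_fit)

# Form five subclasses at the quintiles of the estimated propensity scores.
subclass <- cut(ps_hat, breaks = quantile(ps_hat, probs = seq(0, 1, 0.2)),
                include.lowest = TRUE, labels = FALSE)

# Within each subclass, compare covariate means between treatment groups;
# small within-subclass differences suggest the model balances x1.
within_diff <- sapply(split(data.frame(x1, w), subclass), function(d)
  mean(d$x1[d$w == 1]) - mean(d$x1[d$w == 0]))
overall_diff <- mean(x1[w == 1]) - mean(x1[w == 0])
round(c(overall = overall_diff, within_diff), 3)
```

The unadjusted treatment-control difference in `x1` is large by construction; within subclasses with similar propensity score estimates, the differences shrink considerably.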
Trimming the experimental units based on the propensity score estimates can be helpful, but be aware of how trimming could be connected to the choice of estimand (e.g., trimming procedures can differ depending on whether the ATE or ATT causal estimand is of interest (Imbens & Rubin, 2015: p. 313)).
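A minimal sketch of one such trimming rule, discarding units whose propensity score estimates fall outside $[0.1, 0.9]$; the thresholds here are an illustrative rule of thumb, not a prescription, and estimand-specific procedures differ as noted above.

```r
# Illustrative trimming rule: keep units with estimated propensity
# scores inside [0.1, 0.9]. (Thresholds are an assumed rule of thumb.)
ps_hat <- c(0.02, 0.15, 0.48, 0.85, 0.95, 0.60)
keep   <- ps_hat >= 0.1 & ps_hat <= 0.9
ps_hat[keep]
```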
Imbens & Rubin (2015, p. 293 - 294) provide an automated procedure for propensity score subclassification based on propensity score estimates.
We are interested in comparing two multivariate covariate distributions: one for the treatment group, and the other for the control group.
In practice, we compare the centers and spreads/dispersions of the two distributions.
This can be done via a $t$-test, where the numerator consists of a weighted average of covariate averages across the subclasses, and the denominator consists of a weighted average of sample variances across the subclasses (Imbens & Rubin, 2015: p. 298).
The $t$-test in this context is actually meant to assess whether the differences between the two distributions are so great that biases will remain in causal inferences.
The $t$-test will inevitably reject the null hypothesis in large samples, even if the normalized difference stays the same. As such, it can be less relevant than the normalized difference (Imbens & Rubin, 2015: p. 311).
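The contrast between the two measures can be sketched on simulated covariate data; the mean shift of 0.1 and the sample sizes below are illustrative assumptions. The normalized difference scales the mean difference by the covariate spread and so stays stable as $n$ grows, while the $t$-statistic scales by the standard error and so grows without bound.

```r
# Normalized difference (Imbens & Rubin) vs. the two-sample t-statistic.
norm_diff <- function(xt, xc)
  (mean(xt) - mean(xc)) / sqrt((var(xt) + var(xc)) / 2)
t_stat <- function(xt, xc)
  (mean(xt) - mean(xc)) / sqrt(var(xt) / length(xt) + var(xc) / length(xc))

set.seed(2)
for (n in c(100, 10000)) {
  xt <- rnorm(n, mean = 0.1)  # treated-group covariate, slightly shifted
  xc <- rnorm(n, mean = 0.0)  # control-group covariate
  cat(sprintf("n = %5d: normalized diff = %5.3f, t-stat = %6.2f\n",
              n, norm_diff(xt, xc), t_stat(xt, xc)))
}
```

With the same underlying imbalance, the normalized difference is similar at both sample sizes, but the $t$-statistic at $n = 10{,}000$ is roughly ten times the one at $n = 100$.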
Similar tests can be performed for comparing the dispersions in the two covariate distributions (Imbens & Rubin, 2015: p. 312).
We can consider transformations of the covariates as well. The propensity score can be seen as one such transformation.
Visualizations can be extremely useful for assessing covariate balance.
Love plot.
Overlapping histograms for a single covariate across the treatment and control groups.
Overlapping empirical CDFs for a single covariate across the treatment and control groups.
QQ plots of t-test statistics. If the covariates are well-balanced, then the QQ plots should be flatter than a 45-degree line.
Histograms of t-test statistics. If the covariates are well-balanced, then the histogram should approximate the density of a t-distribution.
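A minimal Love plot sketch in base R. The standardized differences below are illustrative values loosely patterned on the before/after matching balance output shown later in this section, not computed results.

```r
# Love plot sketch: absolute standardized mean differences per covariate,
# before and after adjustment. (Values are illustrative assumptions.)
covs   <- c("EDUC", "AGE", "RE74", "RE75", "BLACK", "MARR")
before <- c(0.88, 1.26, 1.54, 1.44, 1.63, 1.72)  # unadjusted
after  <- c(0.06, 0.12, 0.06, 0.07, 0.01, 0.15)  # after matching

dotchart(before, labels = covs, xlim = c(0, 2), pch = 19,
         xlab = "Absolute standardized mean difference")
points(after, seq_along(covs), pch = 1)
abline(v = 0.1, lty = 2)  # a common rule-of-thumb threshold
legend("bottomright", pch = c(19, 1), legend = c("Before", "After"))
```

Each covariate gets one row; the gap between the filled and open points shows how much the adjustment improved balance.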
If the covariate distributions for the treatment and control groups are similar, then one can be less concerned about the sensitivity of the estimates, and one can have some assurance that bias has been removed from the analyses of the observational study.
Imbens and Rubin (2015, p. 318 - 332) provide four case studies that demonstrate how covariate balance can be assessed in practice.
The manual identification of subclasses and matches can take some time and effort, but is ultimately useful for better understanding the experimental units in the observational study, and fruitful for understanding the mechanics of subclassification and matching.
Consider the setting in which
there is a large pool of possible controls for the design of an observational study,
the set of treated units is fixed a priori, and
the estimand of interest is the average treatment effect for the treated (ATT).
In such a setting, matching methods can be implemented to effectively select a control sample that is well-balanced with respect to the treated experimental units.
In a matching procedure, one or more distinct controls are matched to each treated unit so that their covariates are all very similar to one another.
Matching experimental units can make the subsequent analyses more robust and credible.
# install.packages("Matching")  # uncomment to install, if needed
library(Matching)

# Lalonde data: treated units plus observational controls
# (the TYPE column distinguishes EXP from OBS records).
observational_data = read.table("Lalonde_observational_data.txt", header = TRUE, sep = "")

# Logistic regression model for the propensity scores, including
# selected two-way interactions among the covariates.
propensity_score_model = glm(TREAT ~ EDUC + AGE + RE74 + RE75 + U74 + U75 + BLACK + MARR + HISPANIC +
                               EDUC:AGE + RE75:MARR + AGE:BLACK + EDUC:U74 + U74:U75 + AGE:MARR +
                               AGE:RE75 + EDUC:RE75 + BLACK:MARR + AGE:U74 + AGE:U75,
                             data = observational_data,
                             family = "binomial")
propensity_score_estimates = propensity_score_model$fitted

observed_outcomes = observational_data$RE78  # earnings in 1978
treatment = observational_data$TREAT

# One-to-one matching (M = 1) on the estimated propensity scores.
matched_dataset = Match(Y = observed_outcomes, X = propensity_score_estimates, Tr = treatment, M = 1)
summary(matched_dataset)

# Stack the matched treated and control units into a single data set.
matched_dataset_matrix = rbind(observational_data[matched_dataset$index.treated, ],
                               observational_data[matched_dataset$index.control, ])
head(matched_dataset_matrix)
tail(matched_dataset_matrix)
Loading required package: MASS
##
##  Matching (Version 4.9-7, Build Date: 2020-02-05)
##  See http://sekhon.berkeley.edu/matching for additional documentation.
##  Please cite software as:
##   Jasjeet S. Sekhon. 2011. ``Multivariate and Propensity Score Matching
##   Software with Automated Balance Optimization: The Matching package for R.''
##   Journal of Statistical Software, 42(7): 1-52.
##
Estimate...  1558.2
AI SE......  1751
T-stat.....  0.88987
p.val......  0.37354

Original number of observations..............  2675
Original number of treated obs...............  185
Matched number of observations...............  185
Matched number of observations  (unweighted).  548
| | RE78 | TREAT | MARR | NODEGREE | BLACK | HISPANIC | EDUC | AGE | RE74 | RE75 | U74 | U75 | TYPE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 9930.0 | 1 | 1 | 1 | 1 | 0 | 11 | 37 | 0 | 0 | 1 | 1 | EXP |
2 | 3595.9 | 1 | 0 | 1 | 0 | 1 | 9 | 22 | 0 | 0 | 1 | 1 | EXP |
3 | 24909.5 | 1 | 0 | 0 | 1 | 0 | 12 | 30 | 0 | 0 | 1 | 1 | EXP |
3.1 | 24909.5 | 1 | 0 | 0 | 1 | 0 | 12 | 30 | 0 | 0 | 1 | 1 | EXP |
4 | 7506.1 | 1 | 0 | 1 | 1 | 0 | 11 | 27 | 0 | 0 | 1 | 1 | EXP |
5 | 289.8 | 1 | 0 | 1 | 1 | 0 | 8 | 33 | 0 | 0 | 1 | 1 | EXP |
| | RE78 | TREAT | MARR | NODEGREE | BLACK | HISPANIC | EDUC | AGE | RE74 | RE75 | U74 | U75 | TYPE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2473 | 17732.72 | 0 | 1 | 1 | 1 | 0 | 7 | 50 | 20384.21 | 17008.06 | 0 | 0 | OBS |
2489 | 17732.72 | 0 | 1 | 1 | 1 | 0 | 6 | 48 | 21551.94 | 16112.90 | 0 | 0 | OBS |
2495 | 31032.26 | 0 | 1 | 1 | 1 | 0 | 10 | 22 | 21551.94 | 25064.52 | 0 | 0 | OBS |
2547 | 29554.53 | 0 | 1 | 1 | 1 | 0 | 11 | 44 | 24294.91 | 35806.45 | 0 | 0 | OBS |
2548 | 29554.53 | 0 | 1 | 1 | 1 | 0 | 11 | 44 | 24294.91 | 35806.45 | 0 | 0 | OBS |
2607 | 45809.52 | 0 | 0 | 1 | 1 | 0 | 11 | 36 | 29389.00 | 25982.95 | 0 | 0 | OBS |
# Assess covariate balance before and after matching; nboots sets the
# number of bootstrap samples used for the KS tests.
matched_dataset_balance = MatchBalance(TREAT ~ EDUC + AGE + RE74 + RE75 + U74 + U75 + BLACK + MARR + HISPANIC +
                                         EDUC:AGE + RE75:MARR + AGE:BLACK + EDUC:U74 + U74:U75 + AGE:MARR +
                                         AGE:RE75 + EDUC:RE75 + BLACK:MARR + AGE:U74 + AGE:U75,
                                       data = observational_data,
                                       match.out = matched_dataset,
                                       nboots = 10)
***** (V1) EDUC *****
                       Before Matching        After Matching
mean treatment........        10.346                10.346
mean control..........        12.117                10.474
std mean diff.........       -88.077               -6.3509
mean raw eQQ diff.....        1.8595               0.91058
med  raw eQQ diff.....             2                     1
max  raw eQQ diff.....             5                     3
mean eCDF diff........        0.1091              0.056911
med  eCDF diff........       0.01944              0.037409
max  eCDF diff........       0.40289               0.32117
var ratio (Tr/Co).....       0.42549               0.77938
T-test p-value........    < 2.22e-16               0.50116
KS Bootstrap p-value..    < 2.22e-16            < 2.22e-16
KS Naive p-value......    < 2.22e-16            < 2.22e-16
KS Statistic..........       0.40289               0.32117

[Analogous balance tables for (V2) AGE through (V20) AGE:U75 appear in the full output. In each case the absolute standardized mean difference shrinks substantially after matching, and the after-matching t-test p-values are no longer significant, although the bootstrap KS tests for the continuous covariates remain significant.]

Before Matching Minimum p.value: < 2.22e-16
Variable Name(s): EDUC AGE RE74 RE75 U74 U75 BLACK MARR EDUC:AGE RE75:MARR AGE:BLACK EDUC:U74 U74:U75 AGE:MARR AGE:RE75 EDUC:RE75 AGE:U74 AGE:U75  Number(s): 1 2 3 4 5 6 7 8 10 11 12 13 14 15 16 17 19 20

After Matching Minimum p.value: < 2.22e-16
Variable Name(s): EDUC AGE RE74 RE75 EDUC:AGE RE75:MARR AGE:BLACK EDUC:U74 AGE:MARR AGE:RE75 EDUC:RE75 AGE:U74 AGE:U75  Number(s): 1 2 3 4 10 11 12 13 15 16 17 19 20
Cochran W.G. (1965). The planning of observational studies of human populations. Journal of the Royal Statistical Society: Series A, 128(2), 234-255.
Cochran W.G. (1968). The effectiveness of adjustment by subclassification in removing bias in observational studies. Biometrics, 24(2), 295-313.
Dehejia R.V. and Wahba S. (1999). Causal effects in nonexperimental studies: Reevaluating the evaluation of training programs. Journal of the American Statistical Association 94(448), 1053-1062.
Efron B. (1979). Bootstrap methods: Another look at the jackknife. Annals of Statistics 7(1), 1-26.
Freedman D.A. (2006). Statistical models for causation: What inferential leverage do they provide? Evaluation Review 30(6), 691-713.
Freedman D.A. (2008). On regression adjustments to experimental data. Advances in Applied Mathematics 40(2), 180-193.
Freedman D.A. (2009). Statistical Models: Theory and Practice, Cambridge University Press (2nd edition).
Garcia-Horton V. (2015). Topics in Bayesian Inference for Causal Effects. Harvard University Doctoral Dissertation.
Gelman A. and Pardoe I. (2007). Average predictive comparisons for models with nonlinearity, interactions, and variance components. Sociological Methodology, 37(1), 23-51.
Holland P.W. (1986). Statistics and causal inference. Journal of the American Statistical Association 81(396), 945-960.
Imbens G.W. and D.B. Rubin (2015). Causal Inference for Statistics, Social, and Biomedical Sciences: An Introduction, Cambridge University Press (1st edition).
Künzel S.R., Sekhon J.S., Bickel P.J., Yu B. (2019). Metalearners for estimating heterogeneous treatment effects using machine learning. Proceedings of the National Academy of Sciences of the United States of America 116(10), 4156–4165.
LaLonde R.J. (1986). Evaluating the econometric evaluations of training programs with experimental data. The American Economic Review 76(4), 604-620.
Little R.J.A. and Rubin D.B. (2002). Statistical Analysis With Missing Data. Wiley-Interscience (2nd edition).
Lu M., Sadiq S., Feaster D.J., Ishwaran H. (2018). Estimating individual treatment effect in observational data Using random forest methods. Journal of Computational and Graphical Statistics 27(1), 209-219.
Pearl J. (2009). Causality, Cambridge University Press (2nd edition).
Rubin D.B. (1973). Matching to remove bias in observational studies. Biometrics, 29(1), 159-183.